##############################################
# The Archipelago Capitalism of Citizenship-by-Investment
# Jelena Dzankic, Mira Seyfettinoglu, Ayelet Shachar, Maarten Vink, Luuk van der Baaren
# Comparative Political Studies, accepted for publication, October 2025
#
# Code for analyses
# Code prepared by Maarten Vink with research assistance by Tarek Jaziri Arjona
##############################################

#start with clean workspace
rm(list=ls(all=TRUE))

#load packages
library(tidyverse)
library(DataExplorer)
library(stargazer)
library(brglm)
library(writexl)
library(skimr)
library(vtable)
library(lme4)
library(sandwich)
library(pglm)
library(sjPlot)
library(jtools)
library(glmmTMB)
library(ggpubr)
library(margins)
library(coefplot)
library(glmmPen)
library(gridExtra)
library(grid)
library(effects, quietly = TRUE)
library(ggeffects)
library(marginaleffects)
library(broom.helpers)
library(gtsummary)
#devtools::install_github('IQSS/Zelig')
library(Zelig)
library(gt)
library(corrplot)
library(PerformanceAnalytics)

# read the dataset and add variables and labels  
data_models <- read.csv("01_data/cbi_dat_cens.csv", 
                        stringsAsFactors = FALSE, na.strings = "") 
data_models[data_models == "NA"] <- NA
summary(data_models) 
dim(data_models) #10546 obs of 40 vars

# read the imputed dataset and add variables and labels  
data_models_imp <- read.csv("01_data/cbi_dat_imp_cens.csv", 
                            stringsAsFactors = FALSE, na.strings = "")
data_models_imp[data_models_imp == "NA"] <- NA
dim(data_models_imp) #10546 obs of 40 vars

# Data prep
data_models$cbi_intro <- as.numeric(data_models$cbi_intro)
data_models_imp$cbi_intro <- as.numeric(data_models_imp$cbi_intro)
data_models$cbi_remov <- as.numeric(data_models$cbi_remov)
data_models_imp$cbi_remov <- as.numeric(data_models_imp$cbi_remov)
data_models$cbi_gen_intro <- as.numeric(data_models$cbi_gen_intro)
data_models_imp$cbi_gen_intro <- as.numeric(data_models_imp$cbi_gen_intro)
data_models$cbi_gen_remov <- as.numeric(data_models$cbi_gen_remov)
data_models_imp$cbi_gen_remov <- as.numeric(data_models_imp$cbi_gen_remov)
data_models$year <- as.numeric(data_models$year)
data_models_imp$year <- as.numeric(data_models_imp$year)
data_models$microstate_cat <- ifelse(data_models$microstate == "1", "Yes", "No")
data_models_imp$microstate_cat <- ifelse(data_models_imp$microstate == "1", "Yes", "No")
data_models$late_indep_cat <- ifelse(data_models$late_indep == "1", "Yes", "No")
data_models_imp$late_indep_cat <- ifelse(data_models_imp$late_indep == "1", "Yes", "No")
data_models$cbi_gen_intro <- as.numeric(data_models$cbi_gen_intro)
data_models_imp$cbi_gen_intro <- as.numeric(data_models_imp$cbi_gen_intro)
data_models$middle_income_cat <- ifelse(data_models$income == "Lower middle income" | data_models$income == "Upper middle income", "Yes", "No")
data_models_imp$middle_income_cat <- ifelse(data_models_imp$income == "Lower middle income" | data_models_imp$income == "Upper middle income", "Yes", "No")
data_models$gdp_norm  <- as.numeric(data_models$gdp_norm)
data_models_imp$gdp_norm  <- as.numeric(data_models_imp$gdp_norm)
data_models$fh_status_free_cat <- ifelse(data_models$fh_status == "Free", "Yes", "No")
data_models_imp$fh_status_free_cat <- ifelse(data_models_imp$fh_status == "Free", "Yes", "No")
data_models$fh_norm  <- as.numeric(data_models$fh_norm)
data_models_imp$fh_norm <- as.numeric(data_models_imp$fh_norm)
data_models$v2x_regime_bin <- ifelse(data_models$v2x_regime == "2" | data_models$v2x_regime == "3", "Yes", "No")
data_models_imp$v2x_regime_bin <- ifelse(data_models_imp$v2x_regime == "2" | data_models_imp$v2x_regime == "3", "Yes", "No")
data_models$common_law_cat <- ifelse(data_models$common_law == "1", "Yes", "No")
data_models_imp$common_law_cat <- ifelse(data_models_imp$common_law == "1", "Yes", "No")
data_models$haven_cat <- ifelse(data_models$haven == "1", "Yes", "No")
data_models_imp$haven_cat <- ifelse(data_models_imp$haven == "1", "Yes", "No")
data_models_imp$late_independence_cat <- factor(data_models_imp$late_independence_cat,
                                                  levels = c("Independent by 1960", "Independent 1961 – 1988", "Independent post-1988"))
data_models$late_independence_cat <- factor(data_models$late_independence_cat,
                                                levels = c("Independent by 1960", "Independent 1961 – 1988", "Independent post-1988"))

# set key variables as factors
data_models_imp$haven_cat <- as.factor(data_models_imp$haven_cat)
data_models_imp$microstate_cat <- as.factor(data_models_imp$microstate_cat)
data_models_imp$middle_income_cat <- as.factor(data_models_imp$middle_income_cat)
data_models_imp$common_law_cat <- as.factor(data_models_imp$common_law_cat)
data_models_imp$fh_status_free_cat <- as.factor(data_models_imp$fh_status_free_cat)

# Set the reference level to "No" for both variables
# use factors as independent variables to ensure predicted probabilities are plotted correctly
data_models_imp$haven_cat <- relevel(as.factor(data_models_imp$haven_cat), ref = "No")
data_models_imp$microstate_cat <- relevel(as.factor(data_models_imp$microstate_cat), ref = "No")

## we filter out the remaining missing values to ensure same N across models
data_models <- data_models |>
  filter(!is.na(fh_status_free_cat)) |>
  filter(!is.na(middle_income_cat))
# summary(data_models) #8527 obs
summary(data_models) #8527 obs
dim(data_models) #8527 obs of 47 vars

data_models_imp <- data_models_imp |>
  filter(!is.na(fh_status_free_cat)) |>
  filter(!is.na(middle_income_cat))
# summary(data_models_imp) #8931 obs
summary(data_models_imp) #8931 obs
dim(data_models_imp) #8931 obs of 47 vars


########## SUMMARY STATISTICS ########## 

summary_stats <- data_models_imp |>
  select(c(cbi_intro, year,
           haven_cat, microstate_cat, middle_income_cat, common_law_cat, fh_status_free_cat, late_indep_cat))

# label
var <- c("cbi_intro", 
         "haven_cat", "microstate_cat", "middle_income_cat",
         "common_law_cat","fh_status_free_cat",
         "late_indep_cat", "year")
label <-c("CBI introduction", 
          "Tax haven","Microstate", "Middle income country",
          "Common law tradition", "Free (FH status)", "Late independence (after 1960)", "Year")
df_label <- data.frame(var,label)
st(summary_stats, labels = df_label)

st(summary_stats,
   labels = df_label,
   out = "latex",
   file = "02_plots_tables/SM2.summary.stats.tex")

# -------------------------------------------------------------------------------------------------------
########## MAIN MODELS ########## 

# BRGLM model to address limited DV with bias reduction
# see: https://search.r-project.org/CRAN/refmans/brglm/html/brglm.html
glmbr <- brglm(cbi_intro ~ haven_cat + microstate_cat +
                middle_income_cat + common_law_cat + fh_status_free_cat +
                year, 
                family=binomial(link="logit"), 
               method = "brglm.fit", p1 = T, # Model should be fitted using maximum penalized likelihood, where the penalization is done using Jeffreys invariant prior
               data = data_models_imp)
summary(glmbr)

##### King and Zeng rare events logistic model ##### 
z.out2 <- zelig(cbi_intro ~ haven_cat + microstate_cat +
                  middle_income_cat + common_law_cat + fh_status_free_cat +
                  year,
                data = data_models_imp, model = "relogit", 
                tau = 0.0036, #proportion of CBI introductions (see SM2)
                case.control = "weighting") #The “weighting” method performs a weighted logistic regression to correct for a case-control sampling design. 
summary(z.out2)
# Extract the underlying model from the Zelig object
model_relogit <- z.out2$zelig.out$z.out[[1]]
model_relogit

# GLMM model
glmm <- glmer(cbi_intro ~ haven_cat + microstate_cat + 
                middle_income_cat + common_law_cat + fh_status_free_cat + 
                year +
                (1|iso3c), 
              data = data_models_imp, family = binomial(link = "logit"),
              control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun=2e5)))
# boundary (singular) fit: see help('isSingular')
# singles that model is overfitted
summary(glmm)


# export regression output table
stargazer(glmbr, model_relogit, glmm,
          type = "text", omit = "year",
          dep.var.caption = "Adoption of CBI",
          dep.var.labels.include = FALSE,
          header = F,
          title = "",
          star.char = c("+", "*", "**", "***"),
          star.cutoffs = c(.1, .05, .01, .001),
          notes = c("+ p$<$0.1; * p$<$0.05; ** p$<$0.01; *** p$<$0.001"),
          notes.append=FALSE,
          add.lines = list(c("Year (linear effect)", "Y", "Y", "Y"), 
                           c("Country random intercept", "-", "-", "Y")),
          covariate.labels = c("Tax haven", "Microstate", "Middle income country",
                               "Common law", "Free (FH status)"),
          out = c("02_plots_tables/SM6.main_models_glmbr_relogit_glmm_cbi.txt", 
                  "02_plots_tables/SM6.main_models_glmbr_relogit_glmm_cbi.tex"))


###############################
### FIG 3: PREDICTED VALUES ###
###############################

# TAX HAVEN
e1 <- ggeffect(glmbr, "haven_cat")
print(e1, digits = 4)
# haven_cat | Predicted |         95% CI
# --------------------------------------
# No        |    0.0018 | 0.0010, 0.0030
# Yes       |    0.0069 | 0.0033, 0.0148

#calculate AME's and significance
marginal_haven <- summary(margins(glmbr, variables = "haven_cat", type = "link"))
marginal_haven
# factor        AME     SE      z      p        lower  upper
# haven_catYes 1.3718 0.4261 3.2194 0.0013 0.5367 2.2070

p1 <- plot(e1) + ggtitle("Tax haven") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) + 
  # log transform Y-axis to obtain symmetric confidence intervals
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , 
              round(exp(marginal_haven$AME), 2), sep = "**")))
p1


# Microstate
e2 <- ggeffect(glmbr, "microstate_cat")
print(e2, digits = 4)
# microstate_cat | Predicted |         95% CI
# -------------------------------------------
# No             |    0.0018 | 0.0010, 0.0031
# Yes            |    0.0043 | 0.0021, 0.0089

marginal_small <- summary(margins(glmbr, variables = "microstate_cat", type = "link"))
marginal_small
# factor              AME     SE      z      p      lower  upper
# microstate_catYes 0.8790 0.4356 2.0180 0.0436 0.0253 1.7327

p2 <- plot(e2) + ggtitle("Microstate") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , 
                          round(exp(marginal_small$AME), 2), "*")))
p2

# MIDDLE INCOME
e3 <- ggeffect(glmbr, "middle_income_cat")
print(e3, digits = 4)
# middle_income_cat | Predicted |         95% CI
# ----------------------------------------------
# No                |    0.0013 | 0.0006, 0.0026
# Yes               |    0.0034 | 0.0020, 0.0057

marginal_income <- summary(margins(glmbr, variables = "middle_income_cat", type = "link"))
marginal_income
# factor                AME     SE      z      p    lower  upper
# middle_income_catYes  0.9773 0.3785 2.5823 0.0098 0.2355 1.7191

p3 <- plot(e3) + ggtitle("Middle income country") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , round(exp(marginal_income$AME), 2), sep = "**")))
p3


# COMMON LAW
e4 <- ggeffect(glmbr, "common_law_cat")

marginal_law <- summary(margins(glmbr, variables = "common_law_cat", type = "link"))
marginal_law
# factor    AME     SE      z             p   lower  upper
# common_law_catYes 0.6057 0.4022 1.5058 0.1321 -0.1827 1.3940

p4 <- plot(e4) + ggtitle("Common law tradition") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , round(exp(marginal_law$AME), 2), sep = "")))
p4


## FREEDOM HOUSE
e5 <- ggeffect(glmbr, "fh_status_free_cat")

marginal_fh <- summary(margins(glmbr, variables = "fh_status_free_cat", type = "link"))
marginal_fh
# factor                AME     SE      z     p       lower  upper
# fh_status_free_catYes 0.1441 0.3976 0.3625 0.7170 -0.6352 0.9234

p5 <- plot(e5) + ggtitle("Free (FH status)") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , round(exp(marginal_fh$AME), 2), sep = "")))
p5


# YEAR
e6 <- ggeffect(glmbr, "year")

marginal_year <- summary(margins(glmbr, variables = "year", type = "link"))
marginal_year
# factor    AME     SE      z      p    lower  upper
# year      0.0272 0.0129 2.1156 0.0344 0.0020 0.0524

p6 <- plot(e6) + ggtitle("Year") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1990, y = .0075, label = paste0("AME(exp) = " , round(exp(marginal_year$AME), 2), sep = "*")))
p6
# & scale_x_continuous(breaks = c(1970, 1990, 2010)) &

# combine
patchwork::wrap_plots(p1, p2, p3, p4, p5, p6) + patchwork::plot_annotation(
  title = "Predicted probability of CBI introduction",
  subtitle = "Marginal predictions at the mean (and average marginal effect, exponentiated)",
  caption = "difference between predicted probabilities significant at +p<.1 *p<.05 **p<.01 ***p<.001"
)

ggsave("02_plots_tables/Fig3.coef_plot_main_model_cbi_intro_glmbr.jpeg", dpi=600, units = "cm", width = 20)


###############################
### FIG 4: PREDICTED VALUES ###
###############################

#Fig 4a: pred prob of haven by income, by year
Fig4a <- plot_predictions(glmbr, by = c("year", "haven_cat", "middle_income_cat"), 
                          type = "response") + ylim(-0.005,0.06) + 
  theme_minimal() +
  labs(
    x = "",  # Custom x-axis label
    y = "predicted probability\n",  # Custom y-axis label
    fill = "Tax Haven", color = "Tax Haven"
  ) +
  theme(
    legend.position = "top"  # Moves legend to the top
  ) +
  scale_color_manual(values = c("No" = "#cc2936", "Yes" = "#264653")) +  # Sets the line colors
  scale_fill_manual(values = c("No" = "#cc2936", "Yes" = "#264653")) +     # Sets the fill colors
  scale_y_continuous(labels = scales::label_percent())+
  facet_grid(. ~ middle_income_cat, labeller = labeller(
    middle_income_cat = c("No" = "Low or High Income Country", "Yes" = "Middle Income Country")
  ))
Fig4a

#Fig 4b: pred prob of haven by income
Fig4b <-plot_predictions(glmbr, by = c("haven_cat", "middle_income_cat"), type = "response") + ylim(-0.005,0.06) + theme_minimal() +
  labs(
    x = "\nMiddle income country",  # Custom x-axis label
    y = "predicted probability\n",  # Custom y-axis label
    fill = "Tax Haven", color = "Tax Haven"
  ) +
  theme(
    legend.position = "top"  # Moves legend to the top
  ) +
  scale_color_manual(values = c("No" = "#cc2936", "Yes" = "#264653")) +  # Sets the line colors
  scale_fill_manual(values = c("No" = "#cc2936", "Yes" = "#264653")) +     # Sets the fill colors
  scale_y_continuous(labels = scales::label_percent())

# Fig 4c: with microstate and tax haven, by year
Fig4c <-plot_predictions(glmbr, by = c("year", "haven_cat", "microstate_cat"), type = "response") + ylim(-0.005,0.06) + theme_minimal() +
  labs(
    x = "",  # Custom x-axis label
    y = "predicted probability\n",  # Custom y-axis label
    fill = "Tax Haven", color = "Tax Haven"
  ) +
  theme(
    legend.position = "top"  # Moves legend to the top
  ) +
  scale_color_manual(values = c("No" = "#cc2936", "Yes" = "#264653")) +  # Sets the line colors
  scale_fill_manual(values = c("No" = "#cc2936", "Yes" = "#264653")) +     # Sets the fill colors
  scale_y_continuous(labels = scales::label_percent())+
  facet_grid(. ~ microstate_cat, labeller = labeller(
    microstate_cat = c("No" = "No microstate", "Yes" = "Microstate")
  ))

# Fig 4d: with microstate and tax haven, by year
Fig4d <- plot_predictions(glmbr, by = c("haven_cat", "microstate_cat"), type = "response") + ylim(-0.005,0.06) + theme_minimal() +
  labs(
    x = "\nMicrostate",  # Custom x-axis label
    y = "predicted probability\n",  # Custom y-axis label
    fill = "Tax Haven", color = "Tax Haven"
  ) +
  theme(
    legend.position = "top"  # Moves legend to the top
  ) +
  scale_color_manual(values = c("No" = "#cc2936", "Yes" = "#264653")) +  # Sets the line colors
  scale_fill_manual(values = c("No" = "#cc2936", "Yes" = "#264653")) +     # Sets the fill colors
  scale_y_continuous(labels = scales::label_percent())

# save combi plot
jpeg(file="02_plots_tables/Fig4.pred.prob.combi.jpeg", width=2000, height=1200, res=200)
grid.arrange(Fig4a, Fig4b, Fig4c, Fig4d, nrow = 2, widths = c(0.6, 0.4))
dev.off()


###############################################################################  
###############################################################################
###############################################################################

###############################
### ROBUSTNESS ###
###############################


##### BRGLM model: stepwise ##### 

#empty model with only year
glmbr0 <- brglm(cbi_intro ~ year, 
               family=binomial(link="logit"), 
               method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr0)

# tax haven
glmbr1a <- brglm(cbi_intro ~ haven_cat + year, 
                family=binomial(link="logit"), 
                method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr1a)

# only small country
glmbr1b <- brglm(cbi_intro ~ microstate_cat + year, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr1b)

# only income
glmbr1c <- brglm(cbi_intro ~ middle_income_cat + year, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr1c)

# common law
glmbr1d <- brglm(cbi_intro ~ common_law_cat + year, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr1d)

# only regime
glmbr1e <- brglm(cbi_intro ~ fh_status_free_cat + year, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr1e)

# late independence
glmbr1f <- brglm(cbi_intro ~ late_independence_cat + year, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr1f)

# full model
glmbr1g <- brglm(cbi_intro ~ haven_cat + microstate_cat + middle_income_cat +
                   common_law_cat + fh_status_free_cat + late_independence_cat + 
                   year, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr1g)


######
# neat output table
stargazer(glmbr1a, glmbr1b, glmbr1c, glmbr1d, glmbr1e, glmbr1f, glmbr1g,
          type = "text", omit = "year",
          dep.var.caption = "Adoption of CBI",
          dep.var.labels.include = FALSE,
          header = F,
          title = "",
          add.lines = list(c("Linear Year FE", "Y", "Y", "Y", "Y","Y","Y","Y")),
          covariate.labels = c("Tax haven", "Microstate", "Middle income country",
                               "Common law", "FH Free", 
                               "Independendence (ref: pre-1961):-1961–1988", "-post-1988"),
          star.char = c("+", "*", "**", "***"),
          star.cutoffs = c(.1, .05, .01, .001),
          notes.append=FALSE,
          notes = c("+ p$<$0.1; * p$<$0.05; ** p$<$0.01; *** p$<$0.001"),
          out = c("02_plots_tables/SM9.stepw_models_glmbr_cbi_intro_free.txt", 
                  "02_plots_tables/SM9.stepw_models_glmbr_cbi_intro.tex"))



##### with odds ratios ##### 
models <- list(glmbr)
sjPlot::tab_model(
  models,
  dv.labels = c("CBI introduction"),
  pred.labels = c(microstate_cat = 'Microstate',
                  common_law_cat = 'Common law',
                  middle_income_cat = 'Middle income',
                  fh_status_free_cat = 'FH Free',
                  haven_cat = "Tax haven",
                  year = "Year"),
  show.r2 = TRUE,
  show.icc = TRUE,
  show.aic = T,
  show.loglik = T,
  show.re.var = T,
 # p.style = "scientific",
  emph.p = TRUE,
  file = "02_plots_tables/temp.html")


# Make latex table
# Now we can read the .html file and clean it up a bit:
library(rlist)
library(XML)
tables <- list.clean(readHTMLTable("02_plots_tables/temp.html"), fun = is.null, recursive = FALSE)
tables2 = tables[[1]] %>% janitor::row_to_names(row_number = 1)
tables2 <- as.matrix(tables2) %>% as_tibble()
tables2[is.na(tables2)] <- ""

#So now we have the html table in a "clean" dataframe, and we can use kable() to see it in the terminal:
knitr::kable(tables2, format = "pipe")

# With this final kable() call we can create the latex code below, 
# which is a reasonable approximation to the initial table... 
# although some important things are missing (bold p values, VD top row...)
SM7 <- kable(
  tables2,
  format = "latex",
  booktabs = TRUE,
#  col.names = names(tables2),
  col.names = c("Predictors", "Odds Ratios", "CI", "p"),
  align = c("l", rep("c", length(names(tables2)) - 1)),
  caption = "Likelihood of CBI introduction: odds ratios (and 95 percent confidence intervals) from generalized mixed effects model (GLMM) with country random intercepts and generalized linear model with bias reduction (GLMBR).")
# save in .tex
writeLines(SM7, '02_plots_tables/SM7.Table_main_model_cbi_intro_OR.glmbr.tex')


###### Robustness: check adding dual citizenship data ##### 
## use file 'data_v2.0_country-year.csv' downloaded from https://hdl.handle.net/1814/73190
data_v2_0_country_year <- read_csv("data_v2.0_country-year.csv")

data_v2_0_country_year <- data_v2_0_country_year %>%
  select(iso3, year, A06b_bin, L05_bin) %>%
  rename(iso3c = iso3)

data_models_imp_dc <- data_models_imp %>%
  left_join(data_v2_0_country_year, by = c("iso3c", "year"))
  
data_models_imp_dc$A06b_bin[data_models_imp_dc$A06b_bin == 9] <- NA
data_models_imp_dc <- data_models_imp_dc %>%
  drop_na(A06b_bin)
data_models_imp_dc <- data_models_imp_dc %>%
  drop_na(L05_bin)

## SM14: adding DC variables to main model - in two separate models
glmbr_dc1 <- brglm(cbi_intro ~ haven_cat + microstate_cat + middle_income_cat +
                   common_law_cat + fh_status_free_cat + 
                   year + A06b_bin, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models_imp_dc)
glmbr_dc1
glmbr_dc2 <- brglm(cbi_intro ~ haven_cat + microstate_cat + middle_income_cat +
                     common_law_cat + fh_status_free_cat + 
                     year + L05_bin, 
                   family=binomial(link="logit"), 
                   method = "brglm.fit", p1 = T, data = data_models_imp_dc)


# neat output table with dc
stargazer(glmbr_dc1, glmbr_dc2,
          type = "text", omit = "year",
          dep.var.caption = "Adoption of CBI",
          dep.var.labels.include = FALSE,
          header = F,
          title = "",
          add.lines = list(c("Linear Year FE", "Y", "Y")),
          covariate.labels = c("Tax haven", "Microstate", "Middle income country",
                               "Common law", "Free (FH status)",
                               "Dual citizenship (in country)", "Dual citizenship (abroad)"),
          star.char = c("+", "*", "**", "***"),
          star.cutoffs = c(.1, .05, .01, .001),
          notes.append=FALSE,
          notes = c("+ p$<$0.1; * p$<$0.05; ** p$<$0.01; *** p$<$0.001"),
          out = c("02_plots_tables/SM14.DC_models_glmbr_cbi_intro.txt", 
                  "02_plots_tables/SM14.DC_models_glmbr_cbi_intro.tex"))


########## MORE ROBUSTNESS MODELS ########## 

# Compare models with DV: with cbi_intro and data with and without imputation

# main model with imputation
# Baseline GLMM model
brglm1a <- brglm(cbi_intro ~ haven_cat + microstate_cat + middle_income_cat +
                   common_law_cat + fh_status_free_cat + 
                   year, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models_imp)

# main model without imputation
# Baseline GLMM model
brglm1b <- brglm(cbi_intro ~ haven_cat + microstate_cat + middle_income_cat +
                   common_law_cat + fh_status_free_cat + 
                   year, 
                 family=binomial(link="logit"), 
                 method = "brglm.fit", p1 = T, data = data_models)

# Table
stargazer(brglm1a, brglm1b,
          type = "text", omit = c("year"),
          dep.var.caption = "Adoption of CBI",
          dep.var.labels.include = FALSE,
          header = F,
          model.names = FALSE,
          column.labels = c("with imputation", "without imputation"),
          title = "",
          add.lines = list(c("Year FE", "Y", "Y"),
                           c("Imputation", "Y", "N")),
          covariate.labels = c("Tax haven", "Microstate","Middle income",
                               "Common law","Free (FH status"),
          star.char = c("+", "*", "**", "***"),
          star.cutoffs = c(.1, .05, .01, .001),
          notes.append=FALSE,
          notes = c("+ p$<$0.1; * p$<$0.05; ** p$<$0.01; *** p$<$0.001"),
          out = c("02_plots_tables/SM10.ROB_models_cbi_intro_glmbr_imp.txt", 
                  "02_plots_tables/SM10.Rob_models_cbi_intro_glmbr_imp.tex"))



# third robustness test: continuous variables of GDP and FH
glmbr1a_cont <- brglm(formula = cbi_intro ~ haven_cat + microstate_cat + 
                        common_law_cat + gdp_norm + I(gdp_norm^2) + fh_status_free_cat +
                        year, 
                      family = binomial(link = "logit"), data = data_models_imp, 
                      method = "brglm.fit", p1 = T)

glmbr1b_cont <- brglm(formula = cbi_intro ~ haven_cat + microstate_cat + 
                        common_law_cat + middle_income_cat +
                        + fh_norm + I(fh_norm^2) + year, 
                      family = binomial(link = "logit"), data = data_models_imp, 
                      method = "brglm.fit", p1 = T)

# output table
stargazer(glmbr1a_cont, glmbr1b_cont,
          type = "text", omit = c("year"),
          dep.var.caption = "Adoption of CBI",
          dep.var.labels.include = FALSE,
          header = F,
          model.names = FALSE,
          column.labels = c("with cont' GDP", "with cont' FH"),
          title = "",
          add.lines = list(c("Year FE", "Y", "Y")),
          covariate.labels = c("Tax haven", "Microstate", "Common law",
                               "Income Cont.", "Sqrd. Income Cont.","Free (FH status)",
                               "Middle income country", "FH Cont.", "Sqrd. FH Cont."),
          star.char = c("+", "*", "**", "***"),
          star.cutoffs = c(.1, .05, .01, .001),
          notes.append=FALSE,
          notes = c("+ p$<$0.1; * p$<$0.05; ** p$<$0.01; *** p$<$0.001"),
          out = c("02_plots_tables/SM13.ROB_cont_cbi_intro_glmbr.txt", 
                  "02_plots_tables/SM13.Rob_cont_cbi_intro_glmbr.tex"))



# fourth robustness test: jackknife of countries
# with glmbr 
unique_countries <- unique(data_models_imp$iso3c)

results_df_brglm <- data.frame(country_excluded = character(), variable = character(), estimate = numeric(), std.error = numeric(), stringsAsFactors = FALSE)

for(country in unique_countries) {
  temp_data <- data_models_imp[data_models_imp$iso3c != country,]
  model_brglm <- brglm(cbi_intro ~ haven_cat + microstate_cat + middle_income_cat + common_law_cat + fh_status_free_cat + year, family = binomial(link = "logit"), method = "brglm.fit", p1 = T, data = temp_data)
  
  # Assuming direct extraction of estimates and standard errors
  coefs <- summary(model_brglm)$coefficients  # Adjust as necessary to extract correctly
  
  # Append to results_df_brglm
  for(var in rownames(coefs)) {
    results_df_brglm <- rbind(results_df_brglm, data.frame(country_excluded = country, variable = var, estimate = coefs[var, "Estimate"], std.error = coefs[var, "Std. Error"]))
  }
}

# Filter results for haven
haven_df <- results_df_brglm[results_df_brglm$variable == "haven_catYes",]

# Calculate confidence intervals
haven_df$conf.low <- haven_df$estimate - (1.96 * haven_df$std.error)
haven_df$conf.high <- haven_df$estimate + (1.96 * haven_df$std.error)
haven_df$conf.low.1 <- haven_df$estimate - (1.64 * haven_df$std.error)
haven_df$conf.high.1 <- haven_df$estimate + (1.64 * haven_df$std.error)

haven_graph <- ggplot(haven_df, aes(x = country_excluded, y = estimate)) +
  geom_point() +
  # 90% confidence interval (thicker lines)
  geom_pointrange(aes(ymin = conf.low.1, ymax = conf.high.1), 
                  position = position_dodge(width = 0.3), size = 0.75, linewidth = 1.25) +
  
  # 95% confidence interval (thinner lines)
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), 
                  position = position_dodge(width = 0.3), size = 0.75, linewidth = 0.75) +
  theme_minimal() +
  coord_flip() +
  geom_hline(yintercept = 0, linetype ="dashed") +
  labs(title = "Tax haven",
       x = "country excluded", y = "") +
  theme(axis.text.y = element_text(size = 6)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 
haven_graph
# Note not significant haven effect when removing Tonga: indicates some sensitivity for this variable

# Filter results for microstate
microstate_df <- results_df_brglm[results_df_brglm$variable == "microstate_catYes",]

# Calculate confidence intervals
microstate_df$conf.low <- microstate_df$estimate - (1.96 * microstate_df$std.error)
microstate_df$conf.high <- microstate_df$estimate + (1.96 * microstate_df$std.error)
microstate_df$conf.low.1 <- microstate_df$estimate - (1.64 * microstate_df$std.error)
microstate_df$conf.high.1 <- microstate_df$estimate + (1.64 * microstate_df$std.error)


small_graph <- ggplot(microstate_df, aes(x = country_excluded, y = estimate)) +
  geom_point() +
  # 90% confidence interval (thicker lines)
  geom_pointrange(aes(ymin = conf.low.1, ymax = conf.high.1), 
                  position = position_dodge(width = 0.3), size = 0.75, linewidth = 1.25) +
  
  # 95% confidence interval (thinner lines)
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), 
                  position = position_dodge(width = 0.3), size = 0.75, linewidth = 0.75) +
  theme_minimal() +
  coord_flip() +
  geom_hline(yintercept = 0, linetype ="dashed") +
  labs(title = "Microstate",
       x = "", y = "coefficient estimate") +
  theme(axis.text.y = element_text(size = 0)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) 
small_graph


# Filter results for middle_income
income_df <- results_df_brglm[results_df_brglm$variable == "middle_income_catYes",]

# Calculate confidence intervals
income_df$conf.low <- income_df$estimate - (1.96 * income_df$std.error)
income_df$conf.high <- income_df$estimate + (1.96 * income_df$std.error)
income_df$conf.low.1 <- income_df$estimate - (1.64 * income_df$std.error)
income_df$conf.high.1 <- income_df$estimate + (1.64 * income_df$std.error)

income_graph <- ggplot(income_df, aes(x = country_excluded, y = estimate)) +
  geom_point() +
  # 90% confidence interval (thicker lines)
  geom_pointrange(aes(ymin = conf.low.1, ymax = conf.high.1), 
                  position = position_dodge(width = 0.3), size = 0.75, linewidth = 1.25) +
  
  # 95% confidence interval (thinner lines)
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), 
                  position = position_dodge(width = 0.3), size = 0.75, linewidth = 0.75) +
  
  theme_minimal() +
  coord_flip() +
  geom_hline(yintercept = 0, linetype ="dashed") +
  labs(title = "Middle income",
       x = "", y = "") +
  theme(axis.text.y = element_text(size = 0)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) 
income_graph

# for presentation leave out common law
jackknife_graph <- ggarrange(haven_graph, small_graph, income_graph, ncol = 3, widths = c(0.4, 0.3, 0.3))
jackknife_graph

# save plot
ggsave("02_plots_tables/SM12.ROB_jackknife_glmbr.jpeg", dpi=600, height = 10)



###################################
##### USING V_DEM POLYARCHY
###################################

# read in again baseline datafile to ensure largest possible number of observations
# read the imputed dataset and add variables and labels  
# data file will be save into the project directory after running R script '01_preparation_data.R'
data_models_imp <- read.csv("01_data/cbi_dat_imp_cens.csv", 
                            stringsAsFactors = FALSE, na.strings = "")
data_models_imp[data_models_imp == "NA"] <- NA
# 10546 obs


# Data prep
data_models_imp$cbi_intro <- as.numeric(data_models_imp$cbi_intro)
data_models_imp$cbi_remov <- as.numeric(data_models_imp$cbi_remov)
data_models_imp$year <- as.numeric(data_models_imp$year)
data_models_imp$microstate_cat <- ifelse(data_models_imp$microstate == "1", "Yes", "No")
data_models_imp$late_indep_cat <- ifelse(data_models_imp$late_indep == "1", "Yes", "No")
data_models_imp$middle_income_cat <- ifelse(data_models_imp$income == "Lower middle income" | data_models_imp$income == "Upper middle income", "Yes", "No")
data_models_imp$v2x_regime_bin <- ifelse(data_models_imp$v2x_regime == "2" | data_models_imp$v2x_regime == "3", "Yes", "No")
data_models_imp$common_law_cat <- ifelse(data_models_imp$common_law == "1", "Yes", "No")
data_models_imp$haven_cat <- ifelse(data_models_imp$haven == "1", "Yes", "No")

# set numeric for SM5
data_models_imp$v2x_polyarchy <- as.numeric(data_models_imp$v2x_polyarchy)
data_models_imp$v2x_corr <- as.numeric(data_models_imp$v2x_corr)
data_models_imp$v2exbribe <- as.numeric(data_models_imp$v2exbribe)
data_models_imp$v2exembez <- as.numeric(data_models_imp$v2exembez)
data_models_imp$v2exthftps <- as.numeric(data_models_imp$v2exthftps)
data_models_imp$v2excrptps <- as.numeric(data_models_imp$v2excrptps)
data_models_imp$fh_total_reversed <- as.numeric(data_models_imp$fh_total_reversed)

# set key variables as factors
data_models_imp$haven_cat <- as.factor(data_models_imp$haven_cat)
data_models_imp$microstate_cat <- as.factor(data_models_imp$microstate_cat)
data_models_imp$middle_income_cat <- as.factor(data_models_imp$middle_income_cat)
data_models_imp$common_law_cat <- as.factor(data_models_imp$common_law_cat)
data_models_imp$v2x_regime_bin <- as.factor(data_models_imp$v2x_regime_bin)

# Set the reference level to "No" for both variables
# use factors as independent variables to ensure predicted probabilities are plotted correctly
data_models_imp$haven_cat <- relevel(as.factor(data_models_imp$haven_cat), ref = "No")
data_models_imp$microstate_cat <- relevel(as.factor(data_models_imp$microstate_cat), ref = "No")

# Code to reproduce SM5. Correlation measures regime characteristics from V-DEM and continuous FH measure
data_cor <- data_models_imp |>
  select(c(fh_total_reversed, v2x_polyarchy, v2x_corr, v2exbribe, v2exembez, v2excrptps, v2exthftps))

res <- cor(data_cor, use = "complete.obs")
res_rounded <- round(res, 2)
res_df <- as.data.frame(res_rounded) # Convert to data frame
res_df$Variable <- rownames(res_df) # Add a new column with row names
# prepare table
gt_table <- gt(res_df) %>%
  tab_header(
    title = "Correlation Matrix"
  ) %>%
  cols_label(
    Variable = "Variable"
  ) %>%
  cols_move_to_start(columns = c(Variable)) %>%
  tab_options(
    table.font.size = px(6)
  )

# export gt_table 
gt_table |> gtsave(filename = "02_plots_tables/SM5_correlation_matrix.tex")

# Corelation plot
jpeg("02_plots_tables/SM5.correlation.plot.jpeg",  width = 8, height = 8, units = 'in', res = 600)
corrplot(res, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)
chart.Correlation(data_cor, histogram=TRUE, pch=19)
dev.off()

# SM11: robustness check with VDEM measure

## we filter out the remaining missing values to ensure same N across models
data_models_imp <- data_models_imp |>
  filter(!is.na(v2x_regime_bin)) |>
  filter(!is.na(middle_income_cat))
summary(data_models_imp) #9668 obs

# BRGLM model to address limited DV with bias reduction
# use factors as independent variables to ensure predicted probabilities are plotted correctly
glmbr <- brglm(cbi_intro ~ haven_cat + microstate_cat +
                 middle_income_cat + common_law_cat + 
                 v2x_regime_bin +
                 year, 
               family=binomial(link="logit"), 
               method = "brglm.fit", p1 = T, data = data_models_imp)

# export regression output table
stargazer(glmbr, 
          type = "text", omit = "year",
          dep.var.caption = "Adoption of CBI",
          dep.var.labels.include = FALSE,
          header = F,
          title = "",
          #  keep.stat = c("n","ll","aic", "bic", "max.rsq"),
          add.lines = list(c("Year (linear effect)", "Y", "Y"), c("Country random intercept", "-", "Y")),
          covariate.labels = c("Tax haven", "Microstate", "Middle income country",
                               "Common law", "VD Democracy"),
          star.char = c("+", "*", "**", "***"),
          star.cutoffs = c(.1, .05, .01, .001),
          notes.append=FALSE,
          notes = c("+ p$<$0.1; * p$<$0.05; ** p$<$0.01; *** p$<$0.001"),
          out = c("02_plots_tables/SM11.ROB_models_glmbr_cbi_v-dem.txt", 
                  "02_plots_tables/SM11.ROB_models_glmbr_cbi_v-dem.tex"))


##### CBI generic ##### 

# read the censored dataset prepared for analysis of introduction generic investor policies
data_models_imp <- read.csv("01_data/cbi_dat_imp_generic_cens.csv", 
                            stringsAsFactors = FALSE, na.strings = "")
data_models_imp[data_models_imp == "NA"] <- NA
# 9591 obs

# Data prep
data_models_imp$cbi_gen_intro <- as.numeric(data_models_imp$cbi_gen_intro)
data_models_imp$cbi_gen_remov <- as.numeric(data_models_imp$cbi_gen_remov)
data_models_imp$year <- as.numeric(data_models_imp$year)
data_models_imp$microstate_cat <- ifelse(data_models_imp$microstate == "1", "Yes", "No")
data_models_imp$late_indep_cat <- ifelse(data_models_imp$late_indep == "1", "Yes", "No")
data_models_imp$middle_income_cat <- ifelse(data_models_imp$income == "Lower middle income" | data_models_imp$income == "Upper middle income", "Yes", "No")
data_models_imp$fh_status_free_cat <- ifelse(data_models_imp$fh_status == "Free", "Yes", "No")
data_models_imp$common_law_cat <- ifelse(data_models_imp$common_law == "1", "Yes", "No")
data_models_imp$haven_cat <- ifelse(data_models_imp$haven == "1", "Yes", "No")

# set key variables as factors
data_models_imp$haven_cat <- as.factor(data_models_imp$haven_cat)
data_models_imp$microstate_cat <- as.factor(data_models_imp$microstate_cat)
data_models_imp$middle_income_cat <- as.factor(data_models_imp$middle_income_cat)
data_models_imp$common_law_cat <- as.factor(data_models_imp$common_law_cat)
data_models_imp$fh_status_free_cat <- as.factor(data_models_imp$fh_status_free_cat)

# Set the reference level to "No" for both variables
# use factors as independent variables to ensure predicted probabilities are plotted correctly
data_models_imp$haven_cat <- relevel(as.factor(data_models_imp$haven_cat), ref = "No")
data_models_imp$microstate_cat <- relevel(as.factor(data_models_imp$microstate_cat), ref = "No")


## we filter out the remaining missing values to ensure same N across models
data_models_imp <- data_models_imp |>
  filter(!is.na(fh_status_free_cat)) |>
  filter(!is.na(middle_income_cat))
summary(data_models_imp) #8174 obs

# basic GLM model cbi_gen with RE for year and country
glmbr2 <- brglm(cbi_gen_intro ~ haven_cat + microstate_cat + middle_income_cat +
                 common_law_cat + fh_status_free_cat + 
                 year, 
               family=binomial(link="logit"), 
               method = "brglm.fit", p1 = T, data = data_models_imp)
summary(glmbr2)

# neat output table with dc
stargazer(glmbr2,
          type = "text", omit = "year",
          dep.var.caption = "Adoption of generic investment provision",
          dep.var.labels.include = FALSE,
          header = F,
          title = "",
          add.lines = list(c("Linear Year FE", "Y")),
          covariate.labels = c("Tax haven", "Microstate", "Middle income country",
                               "Common law", "Free (FH status)"),
          star.char = c("+", "*", "**", "***"),
          star.cutoffs = c(.1, .05, .01, .001),
          notes.append=FALSE,
          notes = c("+ p$<$0.1; * p$<$0.05; ** p$<$0.01; *** p$<$0.001"),
          out = c("02_plots_tables/SM8.Rob_models_glmbr_cbi_gen_intro.txt", 
                  "02_plots_tables/SM8.Rob_models_glmbr_cbi_gen_intro.tex"))


## plot in Figure 5
# TAX HAVEN
e1 <- ggeffect(glmbr2, "haven_cat")
e1

#calculate AME's and significance
marginal_haven <- summary(margins(glmbr2, variables = "haven_cat", type = "link"))
marginal_haven
# factor    AME     SE      z      p   lower  upper
# haven_catYes -0.0194 0.7077 -0.0275 0.9781 -1.4065 1.3676

p1 <- plot(e1) + ggtitle("Tax haven") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , round(exp(marginal_haven$AME), 2), sep = "")))
p1


# Microstate
e2 <- ggeffect(glmbr2, "microstate_cat")

marginal_small <- summary(margins(glmbr2, variables = "microstate_cat", type = "link"))
marginal_small
# factor    AME     SE      z      p   lower  upper
# microstate_catYes 0.3443 0.5547 0.6207 0.5348 -0.7429 1.4315

p2 <- plot(e2) + ggtitle("Microstate") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , round(exp(marginal_small$AME), 2), sep = "")))
p2

# MIDDLE INCOME
e3 <- ggeffect(glmbr2, "middle_income_cat")

marginal_income <- summary(margins(glmbr2, variables = "middle_income_cat", type = "link"))
marginal_income
# factor    AME     SE      z      p   lower  upper
# middle_income_catYes 0.5344 0.4698 1.1375 0.2553 -0.3864 1.4552

p3 <- plot(e3) + ggtitle("Middle income country") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , round(exp(marginal_income$AME), 2), sep = "")))
p3


# COMMON LAW
e4 <- ggeffect(glmbr2, "common_law_cat")

marginal_law <- summary(margins(glmbr2, variables = "common_law_cat", type = "link"))
marginal_law
# factor            AME     SE       z      p       lower   upper
# common_law_catYes -1.1336 0.5869 -1.9315 0.0534 -2.2839 0.0167

p4 <- plot(e4) + ggtitle("Common law tradition") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , round(exp(marginal_law$AME), 2), sep = "+")))
p4


## FREEDOM HOUSE
e5 <- ggeffect(glmbr2, "fh_status_free_cat")

marginal_fh <- summary(margins(glmbr2, variables = "fh_status_free_cat", type = "link"))
marginal_fh
# factor    AME     SE      z      p   lower  upper
# fh_status_free_catYes 0.0122 0.4977 0.0246 0.9804 -0.9633 0.9878

p5 <- plot(e5) + ggtitle("Free (FH status)") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1.4, y = 0.0125, label = paste0("AME(exp) = " , round(exp(marginal_fh$AME), 2), sep = "")))
p5


# YEAR
e6 <- ggeffect(glmbr2, "year")

marginal_year <- summary(margins(glmbr2, variables = "year", type = "link"))
marginal_year
# factor     AME     SE       z      p   lower  upper
# year -0.0149 0.0146 -1.0187 0.3083 -0.0435 0.0137

p6 <- plot(e6) + ggtitle("Year") +
  ylab("") +
  xlab(c("")) +
  scale_y_continuous(trans='log10', labels = scales::label_percent()) +
  geom_text(aes(x = 1990, y = .0075, label = paste0("AME(exp) = " , round(exp(marginal_year$AME), 2), sep = "")))
p6

# combine
patchwork::wrap_plots(p1, p2, p3, p4, p5, p6) + patchwork::plot_annotation(
  title = "Predicted probability of introduction generic investment provision for citizenship",
  subtitle = "Marginal predictions at the mean (and average marginal effect, exponentiated)",
  caption = "difference between predicted probabilities significant at +p<.1 *p<.05 **p<.01 ***p<.001"
)

ggsave("02_plots_tables/Fig5.coef_plot_ROB_model_cbi_gen_intro_glmbr.jpeg", dpi=600, units = "cm", width = 20)


### END ###

